# Read the high- and low-popularity exports from the working directory.
high_population_data <- read.csv(file = "high_popularity_spotify_data.csv")
low_population_data <- read.csv(file = "low_popularity_spotify_data.csv")

# Stack both tables (high-popularity rows first) into one data frame.
spotify_data <- do.call(rbind, list(high_population_data, low_population_data))

# Preview the first six rows of the combined data.
head(x = spotify_data, n = 6L)
##   energy   tempo danceability playlist_genre loudness liveness valence
## 1  0.592 157.969        0.521            pop   -7.777   0.1220   0.535
## 2  0.507 104.978        0.747            pop  -10.171   0.1170   0.438
## 3  0.808 108.548        0.554            pop   -4.169   0.1590   0.372
## 4  0.910 112.966        0.670            pop   -4.070   0.3040   0.786
## 5  0.783 149.027        0.777            pop   -4.477   0.3550   0.939
## 6  0.582 116.712        0.700            pop   -5.960   0.0881   0.785
##            track_artist time_signature speechiness track_popularity
## 1 Lady Gaga, Bruno Mars              3      0.0304              100
## 2         Billie Eilish              4      0.0358               97
## 3         Gracie Abrams              4      0.0368               93
## 4     Sabrina Carpenter              4      0.0634               81
## 5      ROSÉ, Bruno Mars              4      0.2600               98
## 6         Chappell Roan              4      0.0356               94
##                                                 track_href
## 1 https://api.spotify.com/v1/tracks/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/tracks/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/tracks/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/tracks/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/tracks/0WbMK4wrZ1wFSty9F7FCgu
##                                    uri          track_album_name
## 1 spotify:track:2plbrEY59IikOBgBGLjaoe          Die With A Smile
## 2 spotify:track:6dOtVTDdiauQNBQEDOtlAB      HIT ME HARD AND SOFT
## 3 spotify:track:7ne4VBA60CxGM75vw0EYad The Secret of Us (Deluxe)
## 4 spotify:track:1d7Ptw3qYcfpdLNL5REhtJ            Short n' Sweet
## 5 spotify:track:5vNRhkKd0yEAg8suGBpjeY                      APT.
## 6 spotify:track:0WbMK4wrZ1wFSty9F7FCgu          Good Luck, Babe!
##      playlist_name
## 1 Today's Top Hits
## 2 Today's Top Hits
## 3 Today's Top Hits
## 4 Today's Top Hits
## 5 Today's Top Hits
## 6 Today's Top Hits
##                                                       analysis_url
## 1 https://api.spotify.com/v1/audio-analysis/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/audio-analysis/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/audio-analysis/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/audio-analysis/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/audio-analysis/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/audio-analysis/0WbMK4wrZ1wFSty9F7FCgu
##                 track_id         track_name track_album_release_date
## 1 2plbrEY59IikOBgBGLjaoe   Die With A Smile               2024-08-16
## 2 6dOtVTDdiauQNBQEDOtlAB BIRDS OF A FEATHER               2024-05-17
## 3 7ne4VBA60CxGM75vw0EYad     That’s So True               2024-10-18
## 4 1d7Ptw3qYcfpdLNL5REhtJ              Taste               2024-08-23
## 5 5vNRhkKd0yEAg8suGBpjeY               APT.               2024-10-18
## 6 0WbMK4wrZ1wFSty9F7FCgu   Good Luck, Babe!               2024-04-05
##   instrumentalness         track_album_id mode key duration_ms acousticness
## 1           0.0000 10FLjwfpbxLmW8c25Xyc2N    0   6      251668       0.3080
## 2           0.0608 7aJuG4TFXa2hmE4z1yxc3n    1   2      210373       0.2000
## 3           0.0000 0hBRqPYPXhr1RkTDG3n4Mk    1   1      166300       0.2140
## 4           0.0000 4B4Elma4nNDUyl6D5PvQkj    0   0      157280       0.0939
## 5           0.0000 2IYQwwgxgOIn7t3iF6ufFD    0   0      169917       0.0283
## 6           0.0000 1WAjjRMfZjEXtB0lQrAw6Q    0  11      218424       0.0502
##                       id playlist_subgenre           type
## 1 2plbrEY59IikOBgBGLjaoe        mainstream audio_features
## 2 6dOtVTDdiauQNBQEDOtlAB        mainstream audio_features
## 3 7ne4VBA60CxGM75vw0EYad        mainstream audio_features
## 4 1d7Ptw3qYcfpdLNL5REhtJ        mainstream audio_features
## 5 5vNRhkKd0yEAg8suGBpjeY        mainstream audio_features
## 6 0WbMK4wrZ1wFSty9F7FCgu        mainstream audio_features
##              playlist_id
## 1 37i9dQZF1DXcBWIGoYBM5M
## 2 37i9dQZF1DXcBWIGoYBM5M
## 3 37i9dQZF1DXcBWIGoYBM5M
## 4 37i9dQZF1DXcBWIGoYBM5M
## 5 37i9dQZF1DXcBWIGoYBM5M
## 6 37i9dQZF1DXcBWIGoYBM5M
# View() opens an interactive data viewer, so only call it when a user is
# attached to the session; this keeps non-interactive runs (Rscript, knitting)
# from failing or trying to open a window.
if (interactive()) {
  View(spotify_data)
}
#3
# Compact structure listing: dimensions plus each column's type and first values.
str(spotify_data)
## 'data.frame':    4831 obs. of  29 variables:
##  $ energy                  : num  0.592 0.507 0.808 0.91 0.783 0.582 0.561 0.247 0.416 0.722 ...
##  $ tempo                   : num  158 105 109 113 149 ...
##  $ danceability            : num  0.521 0.747 0.554 0.67 0.777 0.7 0.669 0.467 0.492 0.769 ...
##  $ playlist_genre          : chr  "pop" "pop" "pop" "pop" ...
##  $ loudness                : num  -7.78 -10.17 -4.17 -4.07 -4.48 ...
##  $ liveness                : num  0.122 0.117 0.159 0.304 0.355 0.0881 0.0954 0.17 0.203 0.111 ...
##  $ valence                 : num  0.535 0.438 0.372 0.786 0.939 0.785 0.841 0.126 0.297 0.57 ...
##  $ track_artist            : chr  "Lady Gaga, Bruno Mars" "Billie Eilish" "Gracie Abrams" "Sabrina Carpenter" ...
##  $ time_signature          : int  3 4 4 4 4 4 4 4 4 4 ...
##  $ speechiness             : num  0.0304 0.0358 0.0368 0.0634 0.26 0.0356 0.0411 0.0431 0.0254 0.0507 ...
##  $ track_popularity        : int  100 97 93 81 98 94 88 93 71 92 ...
##  $ track_href              : chr  "https://api.spotify.com/v1/tracks/2plbrEY59IikOBgBGLjaoe" "https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB" "https://api.spotify.com/v1/tracks/7ne4VBA60CxGM75vw0EYad" "https://api.spotify.com/v1/tracks/1d7Ptw3qYcfpdLNL5REhtJ" ...
##  $ uri                     : chr  "spotify:track:2plbrEY59IikOBgBGLjaoe" "spotify:track:6dOtVTDdiauQNBQEDOtlAB" "spotify:track:7ne4VBA60CxGM75vw0EYad" "spotify:track:1d7Ptw3qYcfpdLNL5REhtJ" ...
##  $ track_album_name        : chr  "Die With A Smile" "HIT ME HARD AND SOFT" "The Secret of Us (Deluxe)" "Short n' Sweet" ...
##  $ playlist_name           : chr  "Today's Top Hits" "Today's Top Hits" "Today's Top Hits" "Today's Top Hits" ...
##  $ analysis_url            : chr  "https://api.spotify.com/v1/audio-analysis/2plbrEY59IikOBgBGLjaoe" "https://api.spotify.com/v1/audio-analysis/6dOtVTDdiauQNBQEDOtlAB" "https://api.spotify.com/v1/audio-analysis/7ne4VBA60CxGM75vw0EYad" "https://api.spotify.com/v1/audio-analysis/1d7Ptw3qYcfpdLNL5REhtJ" ...
##  $ track_id                : chr  "2plbrEY59IikOBgBGLjaoe" "6dOtVTDdiauQNBQEDOtlAB" "7ne4VBA60CxGM75vw0EYad" "1d7Ptw3qYcfpdLNL5REhtJ" ...
##  $ track_name              : chr  "Die With A Smile" "BIRDS OF A FEATHER" "That’s So True" "Taste" ...
##  $ track_album_release_date: chr  "2024-08-16" "2024-05-17" "2024-10-18" "2024-08-23" ...
##  $ instrumentalness        : num  0.00 6.08e-02 0.00 0.00 0.00 0.00 9.62e-03 2.71e-04 8.61e-05 2.56e-06 ...
##  $ track_album_id          : chr  "10FLjwfpbxLmW8c25Xyc2N" "7aJuG4TFXa2hmE4z1yxc3n" "0hBRqPYPXhr1RkTDG3n4Mk" "4B4Elma4nNDUyl6D5PvQkj" ...
##  $ mode                    : int  0 1 1 0 0 0 1 0 1 0 ...
##  $ key                     : int  6 2 1 0 0 11 10 6 11 11 ...
##  $ duration_ms             : int  251668 210373 166300 157280 169917 218424 169698 261467 211979 256000 ...
##  $ acousticness            : num  0.308 0.2 0.214 0.0939 0.0283 0.0502 0.495 0.612 0.686 0.0584 ...
##  $ id                      : chr  "2plbrEY59IikOBgBGLjaoe" "6dOtVTDdiauQNBQEDOtlAB" "7ne4VBA60CxGM75vw0EYad" "1d7Ptw3qYcfpdLNL5REhtJ" ...
##  $ playlist_subgenre       : chr  "mainstream" "mainstream" "mainstream" "mainstream" ...
##  $ type                    : chr  "audio_features" "audio_features" "audio_features" "audio_features" ...
##  $ playlist_id             : chr  "37i9dQZF1DXcBWIGoYBM5M" "37i9dQZF1DXcBWIGoYBM5M" "37i9dQZF1DXcBWIGoYBM5M" "37i9dQZF1DXcBWIGoYBM5M" ...
#29 columns, 4831 rows

#4
# Column-wise summary: min/quartiles/mean/max and NA count for numeric columns;
# length, class and mode for character columns.
summary(spotify_data)
##      energy             tempo         danceability    playlist_genre    
##  Min.   :0.000202   Min.   : 48.23   Min.   :0.0589   Length:4831       
##  1st Qu.:0.442250   1st Qu.: 96.06   1st Qu.:0.5250   Class :character  
##  Median :0.633000   Median :118.06   Median :0.6530   Mode  :character  
##  Mean   :0.586691   Mean   :118.27   Mean   :0.6223                     
##  3rd Qu.:0.777000   3rd Qu.:136.72   3rd Qu.:0.7580                     
##  Max.   :0.998000   Max.   :241.43   Max.   :0.9790                     
##  NA's   :1          NA's   :1        NA's   :1                          
##     loudness          liveness         valence       track_artist      
##  Min.   :-48.069   Min.   :0.0210   Min.   :0.0296   Length:4831       
##  1st Qu.:-10.298   1st Qu.:0.0954   1st Qu.:0.2750   Class :character  
##  Median : -7.191   Median :0.1180   Median :0.4830   Mode  :character  
##  Mean   : -9.282   Mean   :0.1676   Mean   :0.4819                     
##  3rd Qu.: -5.337   3rd Qu.:0.1950   3rd Qu.:0.6900                     
##  Max.   :  1.318   Max.   :0.9790   Max.   :0.9870                     
##  NA's   :1         NA's   :1        NA's   :1                          
##  time_signature   speechiness     track_popularity  track_href       
##  Min.   :1.000   Min.   :0.0219   Min.   : 11.00   Length:4831       
##  1st Qu.:4.000   1st Qu.:0.0386   1st Qu.: 41.00   Class :character  
##  Median :4.000   Median :0.0561   Median : 56.00   Mode  :character  
##  Mean   :3.937   Mean   :0.1017   Mean   : 54.76                     
##  3rd Qu.:4.000   3rd Qu.:0.1180   3rd Qu.: 72.00                     
##  Max.   :5.000   Max.   :0.9270   Max.   :100.00                     
##  NA's   :1       NA's   :1                                           
##      uri            track_album_name   playlist_name      analysis_url      
##  Length:4831        Length:4831        Length:4831        Length:4831       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    track_id          track_name        track_album_release_date
##  Length:4831        Length:4831        Length:4831             
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##                                                                
##  instrumentalness    track_album_id          mode             key        
##  Min.   :0.0000000   Length:4831        Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.0000000   Class :character   1st Qu.:0.0000   1st Qu.: 2.000  
##  Median :0.0000913   Mode  :character   Median :1.0000   Median : 5.000  
##  Mean   :0.2010526                      Mean   :0.5621   Mean   : 5.233  
##  3rd Qu.:0.2005000                      3rd Qu.:1.0000   3rd Qu.: 8.000  
##  Max.   :0.9910000                      Max.   :1.0000   Max.   :11.000  
##  NA's   :1                              NA's   :1        NA's   :1       
##   duration_ms       acousticness            id            playlist_subgenre 
##  Min.   :  35375   Min.   :0.0000036   Length:4831        Length:4831       
##  1st Qu.: 159000   1st Qu.:0.0529250   Class :character   Class :character  
##  Median : 194866   Median :0.2245000   Mode  :character   Mode  :character  
##  Mean   : 206151   Mean   :0.3412170                                        
##  3rd Qu.: 233478   3rd Qu.:0.5900000                                        
##  Max.   :1355260   Max.   :0.9960000                                        
##  NA's   :1         NA's   :1                                                
##      type           playlist_id       
##  Length:4831        Length:4831       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

Introduction

A description of your dataset (what the variables and observations represent)

The dataset consists of Spotify songs and the attributes that Spotify assigns to each of them.

Each observation is a song along with its data.

The following is every variable used in the data and what they represent:

Energy - A measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy.

Tempo - The speed of a track, measured in beats per minute (BPM).

Danceability - A score describing how suitable a track is for dancing based on tempo, rhythm stability, beat strength and overall regularity.

Loudness - The overall loudness of a track in decibels (dB). Higher values indicate louder tracks overall.

Liveness - The likelihood of a track being performed live. Higher values suggest more audience presence.

Valence - The overall musical positiveness(emotion) of a track. High valence sounds happy; low valence sounds sad or angry.

Speechiness - Measures the presence of spoken words.

Instrumentalness - The likelihood a track contains no vocals. Values closer to 1.0 suggest solely instrumental tracks.

Mode - Indicates the modality of the track.

Key - The musical key, represented as an integer from 0 to 11, mapping to standard Pitch class notation.

Duration_ms - The length of the track in milliseconds.

Acousticness - A confidence measure of whether a track is acoustic(1) or not(0).

Track Name - The name of the track.

Track Artist - The artist(s) performing the track.

Track Album Name - The album in which the track is featured.

Track Album Release Date - The release date of the album containing the track.

Track ID - A unique identifier assigned to the track by Spotify.

Track Album ID - A unique identifier for the album.

Playlist Name - The name of the playlist where the track is included.

Playlist Genre - The main genre associated with the playlist (e.g., pop, rock, classical).

Playlist Subgenre - A more specific subgenre tied to the playlist (e.g., indie pop, punk rock).

Playlist ID - A unique identifier for the playlist.

Track Popularity - A score (0–100) which is calculated based on total number of streams in relation to other songs.

A description of your motivation for studying this dataset

Basic summary statistics about your dataset

4831 Rows

29 Columns

Variables such as energy, danceability, and the other scores that Spotify generates (rather than taking directly from the song) are on a scale from 0 to 1.

Loudness is measured in decibels (dB): values near 0 dB are the loudest, and a negative value means the track is that many decibels quieter than normal output.

Tempo is on a scale of beats per minute.

Duration_ms is the length of a song in milliseconds.

Visualizations, Dataset manipulations, and Statistical analyses

# Rebuild the combined table from the two source CSVs and save a merged copy
# (row names are not written to the file).
high_population_data <- read.csv(file = "high_popularity_spotify_data.csv")
low_population_data <- read.csv(file = "low_popularity_spotify_data.csv")
spotify_data <- do.call(rbind, list(high_population_data, low_population_data))
write.csv(x = spotify_data, file = "spotify_data.csv", row.names = FALSE)



library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
library(ggcorrplot)

# Preview the first rows of the combined table.
head(spotify_data)
##   energy   tempo danceability playlist_genre loudness liveness valence
## 1  0.592 157.969        0.521            pop   -7.777   0.1220   0.535
## 2  0.507 104.978        0.747            pop  -10.171   0.1170   0.438
## 3  0.808 108.548        0.554            pop   -4.169   0.1590   0.372
## 4  0.910 112.966        0.670            pop   -4.070   0.3040   0.786
## 5  0.783 149.027        0.777            pop   -4.477   0.3550   0.939
## 6  0.582 116.712        0.700            pop   -5.960   0.0881   0.785
##            track_artist time_signature speechiness track_popularity
## 1 Lady Gaga, Bruno Mars              3      0.0304              100
## 2         Billie Eilish              4      0.0358               97
## 3         Gracie Abrams              4      0.0368               93
## 4     Sabrina Carpenter              4      0.0634               81
## 5      ROSÉ, Bruno Mars              4      0.2600               98
## 6         Chappell Roan              4      0.0356               94
##                                                 track_href
## 1 https://api.spotify.com/v1/tracks/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/tracks/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/tracks/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/tracks/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/tracks/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/tracks/0WbMK4wrZ1wFSty9F7FCgu
##                                    uri          track_album_name
## 1 spotify:track:2plbrEY59IikOBgBGLjaoe          Die With A Smile
## 2 spotify:track:6dOtVTDdiauQNBQEDOtlAB      HIT ME HARD AND SOFT
## 3 spotify:track:7ne4VBA60CxGM75vw0EYad The Secret of Us (Deluxe)
## 4 spotify:track:1d7Ptw3qYcfpdLNL5REhtJ            Short n' Sweet
## 5 spotify:track:5vNRhkKd0yEAg8suGBpjeY                      APT.
## 6 spotify:track:0WbMK4wrZ1wFSty9F7FCgu          Good Luck, Babe!
##      playlist_name
## 1 Today's Top Hits
## 2 Today's Top Hits
## 3 Today's Top Hits
## 4 Today's Top Hits
## 5 Today's Top Hits
## 6 Today's Top Hits
##                                                       analysis_url
## 1 https://api.spotify.com/v1/audio-analysis/2plbrEY59IikOBgBGLjaoe
## 2 https://api.spotify.com/v1/audio-analysis/6dOtVTDdiauQNBQEDOtlAB
## 3 https://api.spotify.com/v1/audio-analysis/7ne4VBA60CxGM75vw0EYad
## 4 https://api.spotify.com/v1/audio-analysis/1d7Ptw3qYcfpdLNL5REhtJ
## 5 https://api.spotify.com/v1/audio-analysis/5vNRhkKd0yEAg8suGBpjeY
## 6 https://api.spotify.com/v1/audio-analysis/0WbMK4wrZ1wFSty9F7FCgu
##                 track_id         track_name track_album_release_date
## 1 2plbrEY59IikOBgBGLjaoe   Die With A Smile               2024-08-16
## 2 6dOtVTDdiauQNBQEDOtlAB BIRDS OF A FEATHER               2024-05-17
## 3 7ne4VBA60CxGM75vw0EYad     That’s So True               2024-10-18
## 4 1d7Ptw3qYcfpdLNL5REhtJ              Taste               2024-08-23
## 5 5vNRhkKd0yEAg8suGBpjeY               APT.               2024-10-18
## 6 0WbMK4wrZ1wFSty9F7FCgu   Good Luck, Babe!               2024-04-05
##   instrumentalness         track_album_id mode key duration_ms acousticness
## 1           0.0000 10FLjwfpbxLmW8c25Xyc2N    0   6      251668       0.3080
## 2           0.0608 7aJuG4TFXa2hmE4z1yxc3n    1   2      210373       0.2000
## 3           0.0000 0hBRqPYPXhr1RkTDG3n4Mk    1   1      166300       0.2140
## 4           0.0000 4B4Elma4nNDUyl6D5PvQkj    0   0      157280       0.0939
## 5           0.0000 2IYQwwgxgOIn7t3iF6ufFD    0   0      169917       0.0283
## 6           0.0000 1WAjjRMfZjEXtB0lQrAw6Q    0  11      218424       0.0502
##                       id playlist_subgenre           type
## 1 2plbrEY59IikOBgBGLjaoe        mainstream audio_features
## 2 6dOtVTDdiauQNBQEDOtlAB        mainstream audio_features
## 3 7ne4VBA60CxGM75vw0EYad        mainstream audio_features
## 4 1d7Ptw3qYcfpdLNL5REhtJ        mainstream audio_features
## 5 5vNRhkKd0yEAg8suGBpjeY        mainstream audio_features
## 6 0WbMK4wrZ1wFSty9F7FCgu        mainstream audio_features
##              playlist_id
## 1 37i9dQZF1DXcBWIGoYBM5M
## 2 37i9dQZF1DXcBWIGoYBM5M
## 3 37i9dQZF1DXcBWIGoYBM5M
## 4 37i9dQZF1DXcBWIGoYBM5M
## 5 37i9dQZF1DXcBWIGoYBM5M
## 6 37i9dQZF1DXcBWIGoYBM5M
#View(spotify_data)

The first plot I made was too hard to read: roughly 4,800 points on one graph, coloured by so many genres, were not legible, so it went unused. I wasn't able to identify any relationships between the data points.

# Exploratory scatter of tempo against energy, coloured by playlist genre.
# Left unused: about 4800 points across so many genres made it unreadable.
plot <- ggplot(
  data = spotify_data,
  mapping = aes(x = energy, y = tempo, color = playlist_genre)
) +
  geom_point()
print(plot)
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

This is the original skeletal boxplot of track popularity by genre. It was interesting to see how widely pop songs vary in popularity and how scarce songs above 80 popularity are in most categories.

# Baseline boxplot: distribution of track popularity within each playlist genre.
boxplot <- ggplot(
  data = spotify_data,
  mapping = aes(x = playlist_genre, y = track_popularity)
) +
  geom_boxplot() +
  # Rotate the genre labels to vertical; hard to read otherwise.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(boxplot)

I ended up remaking the last graph to look nicer for the presentation as well as make outliers more apparent. I’m unsure if I was able to get the options command to work but ended up saving the graph with the necessary width to give the text enough breathing room.

#added for readability of the next plot.
# NOTE(review): repr.plot.* options are presumably only honoured by the Jupyter
# R kernel; confirm whether they have any effect in this document.
options(repr.plot.width = 100, repr.plot.height = 6)

# Presentation version of the popularity boxplot: one fill colour per genre,
# red-filled outlier markers, angled genre labels and no legend.
boxplot <- ggplot(
  data = spotify_data,
  mapping = aes(x = playlist_genre, y = track_popularity, fill = playlist_genre)
) +
  geom_boxplot(
    alpha = 0.7,
    outlier.shape = 21,
    outlier.fill = "red",
    outlier.size = 2
  ) +
  labs(x = "Playlist Genre", y = "Track Popularity") +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    axis.title = element_text(face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1, vjust = 1)
  )
print(boxplot)

# Save a wide copy (width 13.5, height 6) so the angled labels have room.
ggsave(
  filename = "boxplot_wide.png",
  plot = boxplot,
  width = 13.5,
  height = 6,
  dpi = 300
)

Here is where I created the t-test on whether the electronic genre has more average track popularity than lofi track popularity. There was a very significant difference between the average track popularity and the p-value was extremely low.

Here is the correlation heatmap of the variables against each other. Some relationships make a lot of sense, like the strong correlation between a song being instrumental and being acoustic, and higher-energy songs being louder. Others made less sense to me, like sad or angry songs (those given lower valence scores) tending to be quieter. Liveness and tempo ended up having little or no correlation with any of the other variables.

# Visualize correlation using a heatmap
# NOTE(review): cor_matrix is not defined anywhere in the visible code —
# presumably it is built with cor() on the numeric audio features in a chunk
# that is not shown; confirm it exists before this call.
# Draws the lower triangle as circles, labelled with the coefficient values.
ggcorrplot(cor_matrix, method = "circle", type = "lower", lab = TRUE, lab_size = 2.5)

I also wanted to find out how each of these variables varied by genre. It helped me find that classical and wellness were the least danceable. Classical, jazz, lofi, and wellness were found to be the most instrumental by wide margins. Gospel and disco songs are much longer than songs in other genres. Wellness songs were by far the quietest of any genre, which definitely checks out after listening to wellness songs.

  # Mean of each audio feature per playlist genre (missing values ignored):
  # one row per genre, with result columns named avg_<feature>.
  average_features <- spotify_data %>%
    group_by(playlist_genre) %>%
    summarise(
      across(
        all_of(c(
          "energy", "tempo", "danceability", "loudness", "liveness",
          "valence", "speechiness", "instrumentalness", "duration_ms",
          "acousticness"
        )),
        function(x) mean(x, na.rm = TRUE),
        .names = "avg_{.col}"
      )
    )

str(average_features)
## tibble [35 × 11] (S3: tbl_df/tbl/data.frame)
##  $ playlist_genre      : chr [1:35] "afrobeats" "ambient" "arabic" "blues" ...
##  $ avg_energy          : num [1:35] 0.691 0.465 0.637 0.505 0.689 ...
##  $ avg_tempo           : num [1:35] 118 110 117 111 118 ...
##  $ avg_danceability    : num [1:35] 0.728 0.549 0.704 0.604 0.728 ...
##  $ avg_loudness        : num [1:35] -7.97 -13.72 -7.49 -8.8 -6.56 ...
##  $ avg_liveness        : num [1:35] 0.148 0.156 0.155 0.157 0.154 ...
##  $ avg_valence         : num [1:35] 0.608 0.431 0.539 0.424 0.742 ...
##  $ avg_speechiness     : num [1:35] 0.1123 0.1346 0.1677 0.0778 0.1242 ...
##  $ avg_instrumentalness: num [1:35] 0.1275 0.2052 0.0915 0.0121 0.0566 ...
##  $ avg_duration_ms     : num [1:35] 264409 185782 172701 255396 193319 ...
##  $ avg_acousticness    : num [1:35] 0.189 0.49 0.321 0.418 0.346 ...
# Reshape to long form: one row per (genre, feature) pair, with the feature
# name in `feature` and its average in `value`.
average_features_long <- pivot_longer(
  data = average_features,
  cols = -playlist_genre,
  names_to = "feature",
  values_to = "value"
)

str(average_features_long)
## tibble [350 × 3] (S3: tbl_df/tbl/data.frame)
##  $ playlist_genre: chr [1:350] "afrobeats" "afrobeats" "afrobeats" "afrobeats" ...
##  $ feature       : chr [1:350] "avg_energy" "avg_tempo" "avg_danceability" "avg_loudness" ...
##  $ value         : num [1:350] 0.691 118.186 0.728 -7.973 0.148 ...
# Bar chart of one averaged feature across playlist genres.
#
# feature_name: a value from the `feature` column of average_features_long
#   (e.g. "avg_energy"); the "avg_" text is removed for the plot title.
# Returns the ggplot object; the caller is responsible for printing it.
plot_feature <- function(feature_name) {
  feature_rows <- filter(average_features_long, feature == feature_name)
  plot_title <- paste("Average", gsub("avg_", "", feature_name), "by Genre")

  ggplot(
    data = feature_rows,
    mapping = aes(x = playlist_genre, y = value, fill = playlist_genre)
  ) +
    geom_bar(stat = "identity", show.legend = FALSE) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, size = 10, color = "black")
    ) +
    labs(title = plot_title, x = "Playlist Genre", y = "Average Value")
}


# Draw one bar chart per averaged feature, in order of first appearance.
feature_names <- unique(average_features_long[["feature"]])
for (feature in feature_names) {
  print(plot_feature(feature_name = feature))
}

Here is a linear regression between track_popularity and instrumentalness. The intercept ended up being 57.74 so when instrumentalness was 0, track popularity was 57.74 on average. The average of track popularity would decrease as instrumentalness increased, and when instrumentalness was at its maximum of 1 the average would dip to 42.91 on average. This made the regression equation track_popularity = 57.74 - 14.83 * instrumentalness.

Here is a linear regression between acousticness and instrumentalness. The intercept ended up being 0.246112 so when instrumentalness was 0, acousticness was 0.246112 on average. The average acousticness would increase as instrumentalness increased, and when instrumentalness was at its maximum of 1 the average acousticness would rise to 0.719 on average. This made the regression equation acousticness = 0.246 + 0.473 * instrumentalness.

With this relationship, I believe there is a clear positive correlation (R² ≈ 0.26) between a song being instrumental and, in turn, also being acoustic.

# Simple linear regression of acousticness on instrumentalness.
lm_model1 <- lm(formula = acousticness ~ instrumentalness, data = spotify_data)

# Display summary of the regression model
print(summary(lm_model1))
## 
## Call:
## lm(formula = acousticness ~ instrumentalness, data = spotify_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.71205 -0.21482 -0.06111  0.21744  0.74773 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      0.246112   0.004634   53.12   <2e-16 ***
## instrumentalness 0.473034   0.011433   41.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2796 on 4828 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2617, Adjusted R-squared:  0.2616 
## F-statistic:  1712 on 1 and 4828 DF,  p-value: < 2.2e-16
# Visualize the regression with a scatter plot
ggplot(
  data = spotify_data,
  mapping = aes(x = instrumentalness, y = acousticness)
) +
  # Scatter plot points
  geom_point(color = "blue", alpha = 0.5) +
  # Regression line with confidence interval
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  labs(
    title = "Linear Regression: Instrumentalness vs. Acousticness",
    x = "Instrumentalness",
    y = "Acousticness"
  ) +
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Another graph that went unused because it was difficult to read.

# Label each track "High" when its energy exceeds 0.7, otherwise "Low".
spotify_data <- mutate(
  spotify_data,
  energy_level = ifelse(energy > 0.7, "High", "Low")
)



# Keep only the high-energy tracks, i.e. those labelled "High" above
# (energy > 0.7); rows with missing energy are dropped.
high_energy_tracks <- filter(spotify_data, energy_level == "High")

# Danceability against tempo for the high-energy subset, coloured by genre.
ggplot(
  data = high_energy_tracks,
  mapping = aes(x = tempo, y = danceability, color = playlist_genre)
) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Danceability vs. Tempo for High-Energy Tracks",
    x = "Tempo (BPM)",
    y = "Danceability",
    color = "Playlist Genre"
  )

I wanted to find the songs with the largest and smallest value in each category, such as which song is the loudest or the most energetic, and convert the results automatically into a table. It's really interesting to listen to just how different these songs are from each other. It was impressive to see that REI DO BRASIL by Seek is so loud that it has a positive dB value: 0 dB signifies the base loudness a song can reach, and this track went above it.

#What songs are the most/least of the values?

# Reload the source CSVs; this replaces spotify_data, so the energy_level
# column added earlier is no longer present from here on.
high_population_data <- read.csv(file = "high_popularity_spotify_data.csv")
low_population_data <- read.csv(file = "low_popularity_spotify_data.csv")
spotify_data <- do.call(rbind, list(high_population_data, low_population_data))

# For each feature, find the song with the highest and lowest value
#
# Arguments:
#   feature: name (length-one character) of a numeric column in `data`.
#   data:    data frame holding `track_name`, `track_artist` and the `feature`
#            column. Defaults to the global `spotify_data`, so existing
#            one-argument calls behave as before.
#
# Returns a one-row data frame with the feature name and the song, artist and
# value of its maximum and of its minimum. Missing values are ignored; when
# several rows tie, the first one is reported.
#
# Errors if `feature` is not a single column name, if a required column is
# absent, if the column is not numeric, or if it has no non-missing values.
find_extremes <- function(feature, data = spotify_data) {
  stopifnot(is.character(feature), length(feature) == 1L, !is.na(feature))

  required <- c("track_name", "track_artist", feature)
  missing_cols <- setdiff(required, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in data: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  values <- data[[feature]]
  if (!is.numeric(values)) {
    stop("Column '", feature, "' must be numeric.", call. = FALSE)
  }
  # which.max()/which.min() return integer(0) when nothing is comparable, which
  # would otherwise surface as an obscure data.frame() row-count error.
  if (all(is.na(values))) {
    stop("Column '", feature, "' has no non-missing values.", call. = FALSE)
  }

  # Rows holding the largest and smallest value (NAs skipped, first on ties).
  max_row <- data[which.max(values), , drop = FALSE]
  min_row <- data[which.min(values), , drop = FALSE]

  data.frame(
    feature = feature,
    highest_value_song = max_row[["track_name"]],
    highest_value_artist = max_row[["track_artist"]],
    highest_value = max_row[[feature]],
    lowest_value_song = min_row[["track_name"]],
    lowest_value_artist = min_row[["track_artist"]],
    lowest_value = min_row[[feature]],
    stringsAsFactors = FALSE
  )
}


# Numeric audio features to scan for their extreme values.
features <- c(
  "energy", "tempo", "danceability", "loudness", "liveness",
  "valence", "speechiness", "instrumentalness", "duration_ms",
  "acousticness"
)


# One summary row per feature, stacked into a single table.
extreme_values <- bind_rows(lapply(features, find_extremes))

# Print the result
print(extreme_values)
##             feature
## 1            energy
## 2             tempo
## 3      danceability
## 4          loudness
## 5          liveness
## 6           valence
## 7       speechiness
## 8  instrumentalness
## 9       duration_ms
## 10     acousticness
##                                                       highest_value_song
## 1                                                              Hard Beat
## 2                                                     Lo-fi Love Letters
## 3                                                           Ice Ice Baby
## 4                                                          REI DO BRASIL
## 5                                                           Besame Mucho
## 6                                                                   Stop
## 7                                                                 Ucingo
## 8                                                            Psalm 22.21
## 9  Turn It Up / No Longer a Slave / Made a Way / Ekwueme (Medley) [Live]
## 10                                                      Gnossienne No. 1
##                                         highest_value_artist highest_value
## 1                     TNT, Darren Styles, Technoboy, Tuneboy         0.998
## 2                                                Idris Kelly       241.426
## 3                                                Vanilla Ice         0.979
## 4                                                       Seek         1.318
## 5                                               Dave Brubeck         0.979
## 6                                                     B.W.H.         0.987
## 7  Zee Nxumalo, Sly, GL_Ceejay, Kabza De Small, Shakes & Les         0.927
## 8                                         jung jaeil, VOCES8         0.991
## 9                                                 Big Bolaji   1355260.000
## 10                                  Erik Satie, Alena Cherny         0.996
##    lowest_value_song   lowest_value_artist lowest_value
## 1         Foundation           Matheo Lyon   2.0200e-04
## 2               Deep        Peter Sandberg   4.8232e+01
## 3               Rosy          Misha Burton   5.8900e-02
## 4          Oscalated              Setareha  -4.8069e+01
## 5       Ain't It Fun              Paramore   2.1000e-02
## 6            Somnova             Reso Nata   2.9600e-02
## 7         難得有情人          Shirley Kwan   2.1900e-02
## 8   Die With A Smile Lady Gaga, Bruno Mars   0.0000e+00
## 9     At the Library           LoFi Waiter   3.5375e+04
## 10         Star Pool                  c152   3.5900e-06
# Save the extremes table to extreme_values.csv, omitting the row-name column.
write.csv(extreme_values, "extreme_values.csv", row.names = FALSE)

Conclusion

This project allowed me to find a ton of interesting information about track popularity and about songs in many different styles. Running statistics on electronic and lofi music allowed me to know definitively that electronic is more popular on average than lofi music. I learned that Billie Eilish’s music has a large IQR. I learned how different categories of music have large effects on energy, loudness, instrumentalness, and other features as well. Learning how to make graphs to find interesting meanings in the data is something I love to do; I’d love to go into data science as a job and help the world by supplying valuable information. This project was really fun. :)